# Direct outputs not eligible for public release

# E.g., to run
# nohup R CMD BATCH --no-save --no-restore '--args outcomes=c("per_adult_total_empl_income","per_adult_labor_tax_and_transfer","implied_consumption_cap")' & #nolint

args <- (commandArgs(TRUE))
if (length(args) > 0) {
    for (i in 1:length(args)) {
        eval(parse(text = args[[i]]))
    }
}


# GLOBAL SETTINGS --------------------------------------------------------------

options(
    scipen = 999,
    digits = 16,
    max.print = .Machine$integer.max,
    show.error.locations = TRUE,
    warn = 1
)

RNGkind("L'Ecuyer-CMRG")
seed <- 818675309L
set.seed(seed) # setting main seed

# PACKAGES ---------------------------------------------------------------------
library(data.table)
library(Matrix)
library(zoo)
library(multiwayvcov)
library(lmtest)
library(checkmate)
library(futile.logger)
library(lfe)

library(remotes)
remotes::install_github("setzler/eventStudy/eventStudy")
library(eventStudy) # for DiD-IV

# PACKAGE SETTINGS -------------------------------------------------------------

# data.table
setDTthreads(threads = 1L)
options(datatable.print.class = TRUE, datatable.print.keys = TRUE)
# so that printing the data.table also shows the variable type on top

# BEGIN FILE -------------------------------------------------------------------

source("~/code/0-utility-functions/wald_es.R", local = TRUE)
source("~/code/0-utility-functions/mathematical-utils.R", local = TRUE)

MakeUnearnedIncomeEffects <- function(outcome) {

    # Read in lottery panel data
    lottery_panel <- readRDS("~/population-panel-data/lottery_panel_data.rds")

    # For all outcomes, replace with zero if missing
    if (
        (class(lottery_panel[[outcome]]) == "integer") ||
            (class(lottery_panel[[outcome]]) == "numeric")
    ) {
        lottery_panel[is.na(get(outcome)), (outcome) := 0]
    }

    # Similarly, for AGI (used to construct quartiles)
    lottery_panel[is.na(per_adult_adjgross), per_adult_adjgross := 0]

    # Each treated cohort will be restricted to age 21-64 in year 0 and
    # only control units age 21-64 in the same calendar year will be used
    lottery_panel[
        ,
        age_case := as.logical(between(age, 21, 64, incbounds = TRUE))
    ]

    # Produce deliberately aggregated estimates
    collapse_table <-
        data.table(
            a = c("one_to_two", "three_to_five", "post_avg"),
            b = c(list(1:2), list(3:5), list(1:5))
        )

    # Run stacked event-study regression
    did_results_temp <-
        Wald_ES2(
            long_data = copy(lottery_panel),
            outcomevar = outcome,
            unit_var = "tin",
            cal_time_var = "tax_yr",
            onset_time_var = "win_yr",
            cluster_vars = "tin",
            omitted_event_time = -2,
            discrete_covars = "age",
            control_subset_var = "age_case",
            control_subset_event_time = 0,
            treated_subset_var = "age_case",
            treated_subset_event_time = 0,
            heterogeneous_only = TRUE,
            anticipation = 1,
            endog_var = "L_capitalization",
            calculate_collapse_estimates = TRUE,
            collapse_inputs = collapse_table
        )

    did_results_q1_temp <-
        Wald_ES2(
            long_data = copy(lottery_panel),
            outcomevar = outcome,
            unit_var = "tin",
            cal_time_var = "tax_yr",
            onset_time_var = "win_yr",
            cluster_vars = "tin",
            omitted_event_time = -2,
            discrete_covars = "age",
            control_subset_var = "age_case",
            control_subset_event_time = 0,
            treated_subset_var = "age_case",
            treated_subset_event_time = 0,
            heterogeneous_only = TRUE,
            anticipation = 1,
            endog_var = "L_capitalization",
            calculate_collapse_estimates = TRUE,
            collapse_inputs = collapse_table,
            ntile_var = "per_adult_adjgross",
            ntile_event_time = -2,
            ntiles = 4,
            ntile_var_value = 1,
            ntile_avg = FALSE
        )

    did_results_q2_temp <-
        Wald_ES2(
            long_data = copy(lottery_panel),
            outcomevar = outcome,
            unit_var = "tin",
            cal_time_var = "tax_yr",
            onset_time_var = "win_yr",
            cluster_vars = "tin",
            omitted_event_time = -2,
            discrete_covars = "age",
            control_subset_var = "age_case",
            control_subset_event_time = 0,
            treated_subset_var = "age_case",
            treated_subset_event_time = 0,
            heterogeneous_only = TRUE,
            anticipation = 1,
            endog_var = "L_capitalization",
            calculate_collapse_estimates = TRUE,
            collapse_inputs = collapse_table,
            ntile_var = "per_adult_adjgross",
            ntile_event_time = -2,
            ntiles = 4,
            ntile_var_value = 2,
            ntile_avg = FALSE
        )

    did_results_q3_temp <-
        Wald_ES2(
            long_data = copy(lottery_panel),
            outcomevar = outcome,
            unit_var = "tin",
            cal_time_var = "tax_yr",
            onset_time_var = "win_yr",
            cluster_vars = "tin",
            omitted_event_time = -2,
            discrete_covars = "age",
            control_subset_var = "age_case",
            control_subset_event_time = 0,
            treated_subset_var = "age_case",
            treated_subset_event_time = 0,
            heterogeneous_only = TRUE,
            anticipation = 1,
            endog_var = "L_capitalization",
            calculate_collapse_estimates = TRUE,
            collapse_inputs = collapse_table,
            ntile_var = "per_adult_adjgross",
            ntile_event_time = -2,
            ntiles = 4,
            ntile_var_value = 3,
            ntile_avg = FALSE
        )

    did_results_q4_temp <-
        Wald_ES2(
            long_data = copy(lottery_panel),
            outcomevar = outcome,
            unit_var = "tin",
            cal_time_var = "tax_yr",
            onset_time_var = "win_yr",
            cluster_vars = "tin",
            omitted_event_time = -2,
            discrete_covars = "age",
            control_subset_var = "age_case",
            control_subset_event_time = 0,
            treated_subset_var = "age_case",
            treated_subset_event_time = 0,
            heterogeneous_only = TRUE,
            anticipation = 1,
            endog_var = "L_capitalization",
            calculate_collapse_estimates = TRUE,
            collapse_inputs = collapse_table,
            ntile_var = "per_adult_adjgross",
            ntile_event_time = -2,
            ntiles = 4,
            ntile_var_value = 4,
            ntile_avg = FALSE
        )

    # Above results contain more than is needed for this step, so focus them
    # with a quick subsetting function

    MakeMainIncEstimates <- function(did_dt) {

        # main per-period income change estimates
        dincome_dt <-
            setDT(did_dt[[1]])[
                rn == "att" &
                    model == "first_stage" &
                    ref_onset_time %in%
                        c("Cohort-Weighted", "Cohort-Weighted + Collapsed"),
                .(ref_onset_time, ref_event_time, estimate, cluster_se, model, grouping)
            ]

        # main per-period outcome change estimates
        doutcome_dt <-
            setDT(did_dt[[1]])[
                rn == "att" &
                    model == "reduced_form" &
                    ref_onset_time %in%
                        c("Cohort-Weighted", "Cohort-Weighted + Collapsed"),
                .(ref_onset_time, ref_event_time, estimate, cluster_se, model, grouping)
            ]

        # main ratio estimates
        # use Pacini and Windmeijer (2016) with ratio of means
        # given additional imprecision in capitalization approach

        dratio_dt <-
            copy(doutcome_dt[, .(ref_onset_time, ref_event_time, estimate, grouping)])
        setnames(dratio_dt, "estimate", "rf_estimate")

        dratio_dt <-
            merge(
                dratio_dt,
                dincome_dt[, .(ref_onset_time, ref_event_time, estimate, grouping)],
                by = c("ref_onset_time", "ref_event_time", "grouping")
            )
        setnames(dratio_dt, "estimate", "fs_estimate")

        dratio_dt[, estimate := rf_estimate / fs_estimate]
        dratio_dt[, c("rf_estimate", "fs_estimate") := NULL]

        dincome_dt <- dincome_dt[ref_onset_time == "Cohort-Weighted"]
        dincome_dt[, grouping := NULL]
        doutcome_dt <- doutcome_dt[ref_onset_time == "Cohort-Weighted"]
        doutcome_dt[, grouping := NULL]

        # Start with cohort-weighted event-time estimates
        r_of_m_coefs_fs <-
            did_dt[[1]][
                model == "first_stage" &
                    rn == "att" &
                    ref_onset_time == "Cohort-Weighted",
                estimate
            ]

        names(r_of_m_coefs_fs) <-
            did_dt[[1]][
                model == "first_stage" &
                    rn == "att" &
                    ref_onset_time == "Cohort-Weighted",
                ref_event_time
            ]

        r_of_m_coefs_rf <-
            did_dt[[1]][
                model == "reduced_form" &
                    rn == "att" &
                    ref_onset_time == "Cohort-Weighted",
                estimate
            ]

        names(r_of_m_coefs_rf) <-
            did_dt[[1]][
                model == "reduced_form" &
                    rn == "att" &
                    ref_onset_time == "Cohort-Weighted",
                ref_event_time
            ]

        diag_fs <-
            did_dt[[1]][
                model == "first_stage" &
                    rn == "att" &
                    ref_onset_time == "Cohort-Weighted",
                (cluster_se^2)
            ]

        r_of_m_vcov_fs <- as.matrix(diag(x = diag_fs, nrow = length(diag_fs)))

        dimnames(r_of_m_vcov_fs)[[1]] <-
            did_dt[[1]][
                model == "first_stage" &
                    rn == "att" &
                    ref_onset_time == "Cohort-Weighted",
                ref_event_time
            ]
        dimnames(r_of_m_vcov_fs)[[2]] <-
            did_dt[[1]][
                model == "first_stage" &
                    rn == "att" &
                    ref_onset_time == "Cohort-Weighted",
                ref_event_time
            ]

        diag_rf <-
            did_dt[[1]][
                model == "reduced_form" &
                    rn == "att" &
                    ref_onset_time == "Cohort-Weighted",
                (cluster_se^2)
            ]

        r_of_m_vcov_rf <- as.matrix(diag(x = diag_rf, nrow = length(diag_rf)))

        dimnames(r_of_m_vcov_rf)[[1]] <-
            did_dt[[1]][
                model == "reduced_form" &
                    rn == "att" &
                    ref_onset_time == "Cohort-Weighted",
                ref_event_time
            ]
        dimnames(r_of_m_vcov_rf)[[2]] <-
            did_dt[[1]][
                model == "reduced_form" &
                    rn == "att" &
                    ref_onset_time == "Cohort-Weighted",
                ref_event_time
            ]

        ratio_of_means_vcov <-
            makeRatioVcov(
                coefs_fs = r_of_m_coefs_fs,
                coefs_rf = r_of_m_coefs_rf,
                vcov_fs = r_of_m_vcov_fs,
                vcov_rf = r_of_m_vcov_rf
            )
        dimnames(ratio_of_means_vcov) <- dimnames(r_of_m_vcov_rf)

        r_of_m_coefs_fs <- NULL
        r_of_m_coefs_rf <- NULL
        diag_fs <- NULL
        r_of_m_vcov_fs <- NULL
        diag_rf <- NULL
        r_of_m_vcov_rf <- NULL
        rm(
            r_of_m_coefs_fs,
            r_of_m_coefs_rf,
            diag_fs,
            r_of_m_vcov_fs,
            diag_rf,
            r_of_m_vcov_rf
        )

        # Similarly for collapsed estimates
        r_of_m_coefs_fs <-
            did_dt[[1]][
                model == "first_stage" &
                    rn == "att" &
                    ref_onset_time == "Cohort-Weighted + Collapsed",
                estimate
            ]
        names(r_of_m_coefs_fs) <-
            did_dt[[1]][
                model == "first_stage" &
                    rn == "att" &
                    ref_onset_time == "Cohort-Weighted + Collapsed",
                grouping
            ]

        r_of_m_coefs_rf <-
            did_dt[[1]][
                model == "reduced_form" &
                    rn == "att" &
                    ref_onset_time == "Cohort-Weighted + Collapsed",
                estimate
            ]
        names(r_of_m_coefs_rf) <-
            did_dt[[1]][
                model == "reduced_form" &
                    rn == "att" &
                    ref_onset_time == "Cohort-Weighted + Collapsed",
                grouping
            ]

        diag_fs <-
            did_dt[[1]][
                model == "first_stage" &
                    rn == "att" &
                    ref_onset_time == "Cohort-Weighted + Collapsed",
                (cluster_se^2)
            ]
        r_of_m_vcov_fs <- as.matrix(diag(x = diag_fs, nrow = length(diag_fs)))

        dimnames(r_of_m_vcov_fs)[[1]] <-
            did_dt[[1]][
                model == "first_stage" &
                    rn == "att" &
                    ref_onset_time == "Cohort-Weighted + Collapsed",
                grouping
            ]
        dimnames(r_of_m_vcov_fs)[[2]] <-
            did_dt[[1]][
                model == "first_stage" &
                    rn == "att" &
                    ref_onset_time == "Cohort-Weighted + Collapsed",
                grouping
            ]

        diag_rf <-
            did_dt[[1]][
                model == "reduced_form" &
                    rn == "att" &
                    ref_onset_time == "Cohort-Weighted + Collapsed",
                (cluster_se^2)
            ]
        r_of_m_vcov_rf <- as.matrix(diag(x = diag_rf, nrow = length(diag_rf)))
        dimnames(r_of_m_vcov_rf)[[1]] <-
            did_dt[[1]][
                model == "reduced_form" &
                    rn == "att" &
                    ref_onset_time == "Cohort-Weighted + Collapsed",
                grouping
            ]
        dimnames(r_of_m_vcov_rf)[[2]] <-
            did_dt[[1]][
                model == "reduced_form" &
                    rn == "att" &
                    ref_onset_time == "Cohort-Weighted + Collapsed",
                grouping
            ]

        collapsed_ratio_of_means_vcov <-
            makeRatioVcov(
                coefs_fs = r_of_m_coefs_fs,
                coefs_rf = r_of_m_coefs_rf,
                vcov_fs = r_of_m_vcov_fs,
                vcov_rf = r_of_m_vcov_rf
            )
        dimnames(collapsed_ratio_of_means_vcov) <- dimnames(r_of_m_vcov_rf)

        r_of_m_coefs_fs <- NULL
        r_of_m_coefs_rf <- NULL
        diag_fs <- NULL
        r_of_m_vcov_fs <- NULL
        diag_rf <- NULL
        r_of_m_vcov_rf <- NULL
        rm(
            r_of_m_coefs_fs,
            r_of_m_coefs_rf,
            diag_fs,
            r_of_m_vcov_fs,
            diag_rf,
            r_of_m_vcov_rf
        )

        # Now fill-in SEs
        ratio_of_means_ses <-
            data.table(
                ref_onset_time = "Cohort-Weighted",
                ref_event_time = as.integer(names(sqrt(diag(ratio_of_means_vcov)))),
                model = "ratio",
                grouping = NA,
                cluster_se = sqrt(diag(ratio_of_means_vcov))
            )
        rm(ratio_of_means_vcov)

        collapsed_ratio_of_means_ses <-
            data.table(
                ref_onset_time = "Cohort-Weighted + Collapsed",
                ref_event_time = NA,
                model = "ratio",
                grouping = names(sqrt(diag(collapsed_ratio_of_means_vcov))),
                cluster_se = sqrt(diag(collapsed_ratio_of_means_vcov))
            )
        rm(collapsed_ratio_of_means_vcov)

        dratio_ses <-
            rbindlist(
                list(ratio_of_means_ses, collapsed_ratio_of_means_ses),
                use.names = TRUE
            )

        ratio_of_means_ses <- NULL
        collapsed_ratio_of_means_ses <- NULL
        rm(ratio_of_means_ses, collapsed_ratio_of_means_ses)

        dratio_dt <-
            merge(
                dratio_dt,
                dratio_ses,
                by = c("ref_onset_time", "ref_event_time", "grouping")
            )

        dratio_dt[, model := "ratio"]
        dratio_ses <- NULL
        rm(dratio_ses)

        # Will use a 97, 98, 99 convention to get
        # the right order of results in tables
        dratio_dt[
            grouping %in% c("one_to_two", "first_two"),
            ref_event_time := 97
        ]
        dratio_dt[
            grouping == "three_to_five",
            ref_event_time := 98
        ]
        dratio_dt[grouping == "post_avg", ref_event_time := 99]
        dratio_dt[, grouping := NULL]

        combined <-
            rbindlist(
                list(
                    dincome_dt,
                    doutcome_dt,
                    dratio_dt
                ),
                use.names = TRUE
            )
        return(combined)
    }

    did_results <- MakeMainIncEstimates(did_dt = did_results_temp)
    did_results_q1 <- MakeMainIncEstimates(did_dt = did_results_q1_temp)
    did_results_q2 <- MakeMainIncEstimates(did_dt = did_results_q2_temp)
    did_results_q3 <- MakeMainIncEstimates(did_dt = did_results_q3_temp)
    did_results_q4 <- MakeMainIncEstimates(did_dt = did_results_q4_temp)

    # Combine together and identify with the quartile (if relevant)
    # - will use income_quartile = 5 for the aggregate estimates
    did_results[, income_quartile := 5L]
    did_results_q1[, income_quartile := 1L]
    did_results_q2[, income_quartile := 2L]
    did_results_q3[, income_quartile := 3L]
    did_results_q4[, income_quartile := 4L]

    # Only need quartile-specific estimates for ratio
    did_results_q1 <- did_results_q1[model == "ratio"]
    did_results_q2 <- did_results_q2[model == "ratio"]
    did_results_q3 <- did_results_q3[model == "ratio"]
    did_results_q4 <- did_results_q4[model == "ratio"]

    did_results_combined <-
        rbindlist(
            list(
                did_results,
                did_results_q1,
                did_results_q2,
                did_results_q3,
                did_results_q4
            ),
            use.names = TRUE
        )
    did_results_combined[, ref_onset_time := NULL]

    # Restrict attention to:
    # -7 to +5 for total effects (model != "ratio")
    # +1 to +5 (and collapsed) (model == "ratio")
    did_results_combined <-
        did_results_combined[
            (model != "ratio" & ref_event_time %in% c(-7:5)) |
                (model == "ratio" & ref_event_time %in% c(1:5, 97:99))
        ]

    saveRDS(
        did_results_combined,
        sprintf("~/estimation-output/unearned_income_effect_estimates_%s.rds", outcome) # nolint
    )

    return(outcome)
}

num_cores <- length(outcomes)
mcmapply(
    MakeUnearnedIncomeEffects,
    outcomes,
    SIMPLIFY = FALSE,
    mc.silent = FALSE,
    mc.cores = num_cores,
    mc.set.seed = TRUE
)
